{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "from pprint import pprint\n",
    "qc_raw = pd.read_excel(\"zfssz3.result.xlsx\", sheet_name=\"QC信息表\")\n",
    "seq_raw = pd.read_excel(\"sequence.20200428.xlsx\", sheet_name=\"测序信息表\")\n",
    "pooling_raw = pd.read_excel(\"pooling.20200428.xlsx\", sheet_name=\"pooling\")\n",
    "capture_raw = pd.read_excel(\"pooling.20200428.xlsx\", sheet_name=\"capture\")\n",
    "library_raw = pd.read_excel(\"library.20200424.v2.xlsx\", sheet_name=0)\n",
    "extract_raw = pd.read_excel(\"extract.20200430.xlsx\", sheet_name=\"merge\")\n",
    "clinical_raw = pd.read_excel(\"clinical.20200430.xlsx\", sheet_name=\"merge\")\n",
    "sampleId_unique_overlap = pd.read_excel(\"library.20200424.v2.xlsx\", sheet_name=1).loc[:,\"样本编号\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of             样本编号   姓名   性别    年龄      住院号     癌种   分期          诊断  \\\n0     19P3758198  李春萱    女  57.0  19P0501    NaN  NaN      BGI健康人   \n1     19P3758199   蔡流    男  60.0  19P0502    NaN  NaN      BGI健康人   \n2     19P3758200   钟燕    女  57.0  19P0503    NaN  NaN      BGI健康人   \n3     19P3758201  彭小中    女  51.0  19P0504    NaN  NaN      BGI健康人   \n4     19P3758202  韩桂芬    女  60.0  19P0505    NaN  NaN      BGI健康人   \n...          ...  ...  ...   ...      ...    ...  ...         ...   \n1033  19P5803657  杜亚洲    男  49.0  1037698  肝癌MRD    A  肝右叶原发性肝癌     \n1034  19P5803658  叶子才    男  65.0  1039390  肝癌MRD    A    肝占位性病变     \n1035  19P5803659  孙堂进    男  75.0  1039058  肝癌MRD    A     原发性肝癌     \n1036  19P5803660  陈星伟    男  75.0  1038419  肝癌MRD    A     原发性肝癌     \n1037  19P6714067  周竟成  NaN   NaN  1010495  肝癌MRD  NaN         NaN   \n\n                                                   诊断备注       采样日期     医院编号  \\\n0                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n1                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n2                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n3                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n4                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n...                                                 ...        ...      ...   \n1033  肝硬化: 中度; 肝硬化结节: 0.3,0.5; 甲胎蛋白: 13.9; 乙肝病毒表面抗原:...        NaT  中山肝癌MRD   \n1034  肝硬化: 中度; 肝硬化结节: 0.5; 甲胎蛋白: 2.2; 乙肝病毒表面抗原: (-)0...        NaT  中山肝癌MRD   \n1035  肝硬化: 中度; 肝硬化结节: 0.3,0.6; 甲胎蛋白: 105.6; 乙肝病毒表面抗原...        NaT  中山肝癌MRD   \n1036  肝硬化: 无; 肝硬化结节: 无; 甲胎蛋白: 3.5; 乙肝病毒表面抗原: (-)0.50...        NaT  中山肝癌MRD   \n1037  肝硬化: 无; 肝硬化结节: 无; 甲胎蛋白: 无; 乙肝病毒表面抗原: 无; 乙肝病毒表面...        NaT  中山肝癌MRD   \n\n       科室  血浆管数  癌旁组织  癌组织  白细胞  粪便       寄送日期  \\\n0     NaN     1   0.0  0.0    0   0 2019-04-24   \n1     NaN     1   0.0  0.0    1   0 2019-04-24   \n2     NaN     1   0.0  0.0    0   0 2019-04-24   \n3     NaN     1   0.0  0.0    0   0 2019-04-24   \n4     NaN     1   0.0  0.0    0   0 2019-04-24   \n...   ...   ...   ...  ...  ...  ..        ...   \n1033  NaN     2   1.0  1.0    1   0        NaT   \n1034  NaN     1   1.0  1.0    1   0        NaT   \n1035  NaN     2   1.0  1.0    1   0        NaT   \n1036  NaN     2   1.0  1.0    1   0        NaT   \n1037  NaN     1   0.0  0.0    0   0        NaT   \n\n                                                     备注  \n0     员工姓名: 张笑薇, 手机号: 18682465726.0, 给谁报名: 母亲, 电子邮箱地...  \n1     员工姓名: 邓玉梅, 手机号: 13798515806.0, 给谁报名: 父亲, 电子邮箱地...  \n2     员工姓名: 邓玉梅, 手机号: 13798515806.0, 给谁报名: 母亲, 电子邮箱地...  \n3     员工姓名: 柳慧, 手机号: 15625286487.0, 给谁报名: 母亲, 电子邮箱地址...  \n4     员工姓名: 李玉梅, 手机号: 13410176807.0, 给谁报名: 其他, 电子邮箱地...  \n...                                                 ...  \n1033                                                  无  \n1034                                                  无  \n1035                                                  无  \n1036                                                  无  \n1037                                                  无  \n\n[1038 rows x 19 columns]>"
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "source": [
    "clinical_raw.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of             样本编号   姓名   性别    年龄      住院号     癌种   分期          诊断  \\\n0     19P3758198  李春萱    女  57.0  19P0501    NaN  NaN      BGI健康人   \n1     19P3758199   蔡流    男  60.0  19P0502    NaN  NaN      BGI健康人   \n2     19P3758200   钟燕    女  57.0  19P0503    NaN  NaN      BGI健康人   \n3     19P3758201  彭小中    女  51.0  19P0504    NaN  NaN      BGI健康人   \n4     19P3758202  韩桂芬    女  60.0  19P0505    NaN  NaN      BGI健康人   \n...          ...  ...  ...   ...      ...    ...  ...         ...   \n1033  19P5803657  杜亚洲    男  49.0  1037698  肝癌MRD    A  肝右叶原发性肝癌     \n1034  19P5803658  叶子才    男  65.0  1039390  肝癌MRD    A    肝占位性病变     \n1035  19P5803659  孙堂进    男  75.0  1039058  肝癌MRD    A     原发性肝癌     \n1036  19P5803660  陈星伟    男  75.0  1038419  肝癌MRD    A     原发性肝癌     \n1037  19P6714067  周竟成  NaN   NaN  1010495  肝癌MRD  NaN         NaN   \n\n                                                   诊断备注       采样日期     医院编号  \\\n0                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n1                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n2                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n3                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n4                                    肿瘤患病史: 无, 家族肿瘤史: 无 2019-04-24   BGI健康人   \n...                                                 ...        ...      ...   \n1033  肝硬化: 中度; 肝硬化结节: 0.3,0.5; 甲胎蛋白: 13.9; 乙肝病毒表面抗原:...        NaT  中山肝癌MRD   \n1034  肝硬化: 中度; 肝硬化结节: 0.5; 甲胎蛋白: 2.2; 乙肝病毒表面抗原: (-)0...        NaT  中山肝癌MRD   \n1035  肝硬化: 中度; 肝硬化结节: 0.3,0.6; 甲胎蛋白: 105.6; 乙肝病毒表面抗原...        NaT  中山肝癌MRD   \n1036  肝硬化: 无; 肝硬化结节: 无; 甲胎蛋白: 3.5; 乙肝病毒表面抗原: (-)0.50...        NaT  中山肝癌MRD   \n1037  肝硬化: 无; 肝硬化结节: 无; 甲胎蛋白: 无; 乙肝病毒表面抗原: 无; 乙肝病毒表面...        NaT  中山肝癌MRD   \n\n       科室  血浆管数  癌旁组织  癌组织  白细胞  粪便       寄送日期  \\\n0     NaN     1   0.0  0.0    0   0 2019-04-24   \n1     NaN     1   0.0  0.0    1   0 2019-04-24   \n2     NaN     1   0.0  0.0    0   0 2019-04-24   \n3     NaN     1   0.0  0.0    0   0 2019-04-24   \n4     NaN     1   0.0  0.0    0   0 2019-04-24   \n...   ...   ...   ...  ...  ...  ..        ...   \n1033  NaN     2   1.0  1.0    1   0        NaT   \n1034  NaN     1   1.0  1.0    1   0        NaT   \n1035  NaN     2   1.0  1.0    1   0        NaT   \n1036  NaN     2   1.0  1.0    1   0        NaT   \n1037  NaN     1   0.0  0.0    0   0        NaT   \n\n                                                     备注  \n0     员工姓名: 张笑薇, 手机号: 18682465726.0, 给谁报名: 母亲, 电子邮箱地...  \n1     员工姓名: 邓玉梅, 手机号: 13798515806.0, 给谁报名: 父亲, 电子邮箱地...  \n2     员工姓名: 邓玉梅, 手机号: 13798515806.0, 给谁报名: 母亲, 电子邮箱地...  \n3     员工姓名: 柳慧, 手机号: 15625286487.0, 给谁报名: 母亲, 电子邮箱地址...  \n4     员工姓名: 李玉梅, 手机号: 13410176807.0, 给谁报名: 其他, 电子邮箱地...  \n...                                                 ...  \n1033                                                  无  \n1034                                                  无  \n1035                                                  无  \n1036                                                  无  \n1037                                                  无  \n\n[1038 rows x 19 columns]>"
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "source": [
    "clinical_raw_filt = clinical_raw.loc[clinical_raw.loc[:, \"样本编号\"].isin(sampleId_unique_overlap),:]\n",
    "clinical_raw_filt.head\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of            DNA提取编号        样本编号       提取日期 样本类型 样本体积     提取方法     浓度    体积  备注\n0     19P3758198_1  19P3758198 2019-04-24   血浆    9      NaN  0.608  73.0 NaN\n1     19P3758199_1  19P3758199 2019-04-24   血浆    5      NaN  0.928  42.0 NaN\n2     19P3758200_1  19P3758200 2019-04-24   血浆  5.5      NaN  0.772  42.0 NaN\n3     19P3758201_1  19P3758201 2019-04-24   血浆  9.5      NaN  0.540  73.0 NaN\n4     19P3758202_1  19P3758202 2019-04-24   血浆  9.1      NaN  0.470  73.0 NaN\n...            ...         ...        ...  ...  ...      ...    ...   ...  ..\n1020  19P5803657_1  19P5803657        NaT   血浆    4  美基XP双选法  6.140  43.0 NaN\n1021  19P5803655_1  19P5803655        NaT   血浆    4  美基XP双选法  8.140  43.0 NaN\n1022  19P5803660_1  19P5803660        NaT   血浆    4  美基XP双选法  1.530  43.0 NaN\n1023  19P5803659_1  19P5803659        NaT   血浆    4  美基XP双选法  4.060  43.0 NaN\n1024  19P5803658_1  19P5803658        NaT   血浆    4  美基XP双选法  8.600  43.0 NaN\n\n[1025 rows x 9 columns]>"
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "extract_raw.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of            DNA提取编号        样本编号       提取日期 样本类型 样本体积     提取方法     浓度    体积  备注\n0     19P3758198_1  19P3758198 2019-04-24   血浆    9      NaN  0.608  73.0 NaN\n1     19P3758199_1  19P3758199 2019-04-24   血浆    5      NaN  0.928  42.0 NaN\n2     19P3758200_1  19P3758200 2019-04-24   血浆  5.5      NaN  0.772  42.0 NaN\n3     19P3758201_1  19P3758201 2019-04-24   血浆  9.5      NaN  0.540  73.0 NaN\n4     19P3758202_1  19P3758202 2019-04-24   血浆  9.1      NaN  0.470  73.0 NaN\n...            ...         ...        ...  ...  ...      ...    ...   ...  ..\n1020  19P5803657_1  19P5803657        NaT   血浆    4  美基XP双选法  6.140  43.0 NaN\n1021  19P5803655_1  19P5803655        NaT   血浆    4  美基XP双选法  8.140  43.0 NaN\n1022  19P5803660_1  19P5803660        NaT   血浆    4  美基XP双选法  1.530  43.0 NaN\n1023  19P5803659_1  19P5803659        NaT   血浆    4  美基XP双选法  4.060  43.0 NaN\n1024  19P5803658_1  19P5803658        NaT   血浆    4  美基XP双选法  8.600  43.0 NaN\n\n[1025 rows x 9 columns]>"
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "source": [
    "extract_raw_filt = extract_raw.loc[extract_raw.loc[:, \"样本编号\"].isin(sampleId_unique_overlap), :]\n",
    "extract_raw_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                  建库编号       DNA提取编号 管上编号    是否临床        文库名 样本标签 index列表  \\\n0     18S4146706_1_m1  18S4146706_1    1  Clinic      ssM-1    P   57,58   \n1     18S4146707_1_m1  18S4146707_1    2  Clinic      ssM-2    P   59,60   \n2     18S4146708_1_m1  18S4146708_1    3  Clinic      ssM-3    P   61,62   \n3     18S4146709_1_m1  18S4146709_1    4  Clinic      ssM-4    P   63,64   \n4     18S4146710_1_m1  18S4146710_1    5  Clinic      ssM-5    P   65,66   \n...               ...           ...  ...     ...        ...  ...     ...   \n1100  20L5383633_1_m1  20L5383633_1   12  Clinic  ssM-12-pl    T      68   \n1101  20S5383594_1_m1  20S5383594_1   13  Clinic  ssM-13-pl    T      69   \n1102  20L5383634_1_m1  20L5383634_1   14  Clinic  ssM-14-pl    T      70   \n1103  20S5383595_1_m1  20S5383595_1   15  Clinic  ssM-15-pl    T      71   \n1104  20L5383635_1_m1  20L5383635_1   16  Clinic  ssM-16-pl    T      72   \n\n            建库日期  建库方法    试剂批次  ...                               杂交探针  杂交时间h  \\\n0     2018/12/11   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit   66.5   \n1     2018/12/11   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit   66.5   \n2     2018/12/11   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit   66.5   \n3     2018/12/11   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit   66.5   \n4     2018/12/11   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit   67.5   \n...          ...   ...     ...  ...                                ...    ...   \n1100  2020/02/10   2.0  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit   70.5   \n1101  2020/02/10   2.0  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit   70.5   \n1102  2020/02/10   2.0  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit   70.5   \n1103  2020/02/10   2.0  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit   70.5   \n1104  2020/02/10   2.0  Lot009  ...  SeqCap Epi CpGiant Enrichment Kit   70.5   \n\n      PostPCR循环数 PostPCR浓度ng/ul  洗脱体积 文库质控                    文库名称  \\\n0             12           16.2  40.0    -  ssM-CpGiant-1-20181218   \n1             12           16.2  40.0    -  ssM-CpGiant-1-20181218   \n2             12           16.2  40.0    -  ssM-CpGiant-1-20181218   \n3             12           16.2  40.0    -  ssM-CpGiant-1-20181218   \n4             13           17.6  40.0    -  ssM-CpGiant-2-20181218   \n...          ...            ...   ...  ...                     ...   \n1100           9           51.6  40.0    -  ssM-CpGiant-6-20200214   \n1101           9           51.6  40.0    -  ssM-CpGiant-6-20200214   \n1102           9           51.6  40.0    -  ssM-CpGiant-6-20200214   \n1103           9           51.6  40.0    -  ssM-CpGiant-6-20200214   \n1104           9           51.6  40.0    -  ssM-CpGiant-6-20200214   \n\n         华大样本编号P 捕获文库操作人        杂交日期  \n0     18P4146706     NaN  2018/12/18  \n1     18P4146707     NaN  2018/12/18  \n2     18P4146708     NaN  2018/12/18  \n3     18P4146709     NaN  2018/12/18  \n4     18P4146710     NaN  2018/12/18  \n...          ...     ...         ...  \n1100  20P5383633     LZL  2020/02/14  \n1101  20P5383594     LZL  2020/02/14  \n1102  20P5383634     LZL  2020/02/14  \n1103  20P5383595     LZL  2020/02/14  \n1104  20P5383635     LZL  2020/02/14  \n\n[1105 rows x 34 columns]>"
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "source": [
    "library_raw.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                  建库编号       DNA提取编号 管上编号    是否临床        文库名 样本标签 index列表  \\\n10    18P4063823_1_m1  18P4063823_1    5  Clinic          5    P   65,66   \n11    18P4063845_1_m1  18P4063845_1   16  Clinic         16    P   71,72   \n12    18P4063830_1_m1  18P4063830_1   23  Clinic     ssM-23    P   67,68   \n13    18P4063833_1_m1  18P4063833_1   31  Clinic     ssM-31    P   69,70   \n14    18P4063839_1_m1  18P4063839_1   38  Clinic     ssM-38    P   63,64   \n...               ...           ...  ...     ...        ...  ...     ...   \n1060  19P7247863_1_m1  19P7247863_1   20  Clinic  ssM-20-pl    P      76   \n1061  19P7247850_1_m1  19P7247850_1   21  Clinic  ssM-21-pl    P      77   \n1062  19P7247862_1_m1  19P7247862_1   22  Clinic  ssM-22-pl    P      78   \n1063  19P7247867_1_m1  19P7247867_1   23  Clinic  ssM-23-pl    P      79   \n1064  19P7247884_1_m1  19P7247884_1   24  Clinic  ssM-24-pl    P      80   \n\n            建库日期  建库方法    试剂批次  ...                               杂交探针  杂交时间h  \\\n10    2018/12/26   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit     67   \n11    2018/12/26   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit     67   \n12    2018/12/26   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit     70   \n13    2018/12/26   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit     70   \n14    2018/12/26   1.0       无  ...  SeqCap Epi CpGiant Enrichment Kit     70   \n...          ...   ...     ...  ...                                ...    ...   \n1060  2020/01/22   2.2  Lot008  ...  SeqCap Epi CpGiant Enrichment Kit   65.5   \n1061  2020/01/22   2.2  Lot008  ...  SeqCap Epi CpGiant Enrichment Kit   65.5   \n1062  2020/01/22   2.2  Lot008  ...  SeqCap Epi CpGiant Enrichment Kit   65.5   \n1063  2020/01/22   2.2  Lot008  ...  SeqCap Epi CpGiant Enrichment Kit   65.5   \n1064  2020/01/22   2.2  Lot008  ...  SeqCap Epi CpGiant Enrichment Kit   65.5   \n\n      PostPCR循环数 PostPCR浓度ng/ul  洗脱体积 文库质控                    文库名称  \\\n10            12            110  40.0    -  ssM-CpGiant-4-20190104   \n11            12            110  40.0    -  ssM-CpGiant-4-20190104   \n12            11           38.8  40.0    -  ssM-CpGiant-2-20190109   \n13            11           59.8  40.0    -  ssM-CpGiant-3-20190109   \n14            11           48.8  40.0    -  ssM-CpGiant-4-20190109   \n...          ...            ...   ...  ...                     ...   \n1060          10           89.2  40.0    -  ssM-CpGiant-8-20200131   \n1061          10           89.2  40.0    -  ssM-CpGiant-8-20200131   \n1062          10           89.2  40.0    -  ssM-CpGiant-8-20200131   \n1063          10           89.2  40.0    -  ssM-CpGiant-8-20200131   \n1064          10           89.2  40.0    -  ssM-CpGiant-8-20200131   \n\n         华大样本编号P 捕获文库操作人        杂交日期  \n10    18P4063823     NaN  2019/01/04  \n11    18P4063845     NaN  2019/01/04  \n12    18P4063830     NaN  2019/01/09  \n13    18P4063833     NaN  2019/01/09  \n14    18P4063839     NaN  2019/01/09  \n...          ...     ...         ...  \n1060  19P7247863     LZL  2020/01/31  \n1061  19P7247850     LZL  2020/01/31  \n1062  19P7247862     LZL  2020/01/31  \n1063  19P7247867     LZL  2020/01/31  \n1064  19P7247884     LZL  2020/01/31  \n\n[995 rows x 34 columns]>"
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "library_raw_filt = library_raw.loc[library_raw.loc[:, \"DNA提取编号\"].isin(extract_raw_filt.loc[:, \"DNA提取编号\"]), :]\n",
    "library_raw_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "count = {}\n",
    "tmp = []\n",
    "for idx in range(pooling_raw.shape[0]):\n",
    "    k = pooling_raw.loc[idx, \"建库编号\"]\n",
    "    count[k] = 1+count[k] if k in count else 1 \n",
    "    tmp.append(\"{}_{}\".format(k, count[k]))\n",
    "pooling_raw.loc[:, \"测序文库名\"] = tmp\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                  建库编号                    文库名称  虚拟入库样本名  pooling比例  取样ng  \\\n0     18S4146706_1_m1  ssM-CpGiant-1-20181218  IDTL130        1.0   250   \n1     18S4146707_1_m1  ssM-CpGiant-1-20181218  IDTL130        1.0   250   \n2     18S4146708_1_m1  ssM-CpGiant-1-20181218  IDTL130        1.0   250   \n3     18S4146709_1_m1  ssM-CpGiant-1-20181218  IDTL130        1.0   250   \n4     18S4146710_1_m1  ssM-CpGiant-2-20181218  IDTL131        1.0   250   \n...               ...                     ...      ...        ...   ...   \n1078  20L5383633_1_m1  ssM-CpGiant-6-20200214    LM310        1.0   250   \n1079  20S5383594_1_m1  ssM-CpGiant-6-20200214    LM310        1.0   250   \n1080  20L5383634_1_m1  ssM-CpGiant-6-20200214    LM310        1.0   250   \n1081  20S5383595_1_m1  ssM-CpGiant-6-20200214    LM310        1.0   250   \n1082  20L5383635_1_m1  ssM-CpGiant-6-20200214    LM310        1.0   250   \n\n           体积ul              测序文库名  \n0      4.166667  18S4146706_1_m1_1  \n1      3.571429  18S4146707_1_m1_1  \n2      4.921260  18S4146708_1_m1_1  \n3      4.251701  18S4146709_1_m1_1  \n4      4.006410  18S4146710_1_m1_1  \n...         ...                ...  \n1078  10.416667  20L5383633_1_m1_1  \n1079  10.683761  20S5383594_1_m1_1  \n1080  13.297872  20L5383634_1_m1_1  \n1081  10.162602  20S5383595_1_m1_1  \n1082   9.920635  20L5383635_1_m1_1  \n\n[1083 rows x 7 columns]>"
     },
     "metadata": {},
     "execution_count": 13
    }
   ],
   "source": [
    "pooling_raw.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                  建库编号                    文库名称  虚拟入库样本名  pooling比例  取样ng  \\\n10    18P4063823_1_m1  ssM-CpGiant-4-20190104  IDTL145        1.0   500   \n11    18P4063845_1_m1  ssM-CpGiant-4-20190104  IDTL145        1.0   500   \n12    18P4063830_1_m1  ssM-CpGiant-2-20190109    LM001        1.0   500   \n13    18P4063833_1_m1  ssM-CpGiant-3-20190109    LM002        1.0   500   \n14    18P4063839_1_m1  ssM-CpGiant-4-20190109    LM002        1.0   500   \n...               ...                     ...      ...        ...   ...   \n1038  19P7247863_1_m1  ssM-CpGiant-8-20200131    LM289        1.0   250   \n1039  19P7247850_1_m1  ssM-CpGiant-8-20200131    LM289        1.0   250   \n1040  19P7247862_1_m1  ssM-CpGiant-8-20200131    LM289        1.0   250   \n1041  19P7247867_1_m1  ssM-CpGiant-8-20200131    LM289        1.0   250   \n1042  19P7247884_1_m1  ssM-CpGiant-8-20200131    LM289        1.0   250   \n\n           体积ul              测序文库名  \n10     9.505703  18P4063823_1_m1_1  \n11    10.822511  18P4063845_1_m1_1  \n12    13.440860  18P4063830_1_m1_1  \n13    12.500000  18P4063833_1_m1_1  \n14    12.562814  18P4063839_1_m1_1  \n...         ...                ...  \n1038  10.964912  19P7247863_1_m1_1  \n1039   9.920635  19P7247850_1_m1_1  \n1040  13.736264  19P7247862_1_m1_1  \n1041  10.869565  19P7247867_1_m1_1  \n1042  10.162602  19P7247884_1_m1_1  \n\n[973 rows x 7 columns]>"
     },
     "metadata": {},
     "execution_count": 14
    }
   ],
   "source": [
    "pooling_raw_filt = pooling_raw.loc[pooling_raw.loc[:, \"建库编号\"].isin(library_raw_filt.loc[:, \"建库编号\"]), :]\n",
    "pooling_raw_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pooling_raw_filt.loc[:,\"文库名称\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                        文库名称  虚拟入库样本名        杂交日期  \\\n0    ssM-CpGiant-1-20181218  IDTL130  2018/12/18   \n1    ssM-CpGiant-2-20181218  IDTL131  2018/12/18   \n2    ssM-CpGiant-1-20190104  IDTL142  2019/01/04   \n3    ssM-CpGiant-4-20190104  IDTL145  2019/01/04   \n4    ssM-CpGiant-2-20190109    LM001  2019/01/09   \n..                      ...      ...         ...   \n212  ssM-CpGiant-2-20200214    LM306  2020/02/14   \n213  ssM-CpGiant-3-20200214    LM307  2020/02/14   \n214  ssM-CpGiant-4-20200214    LM308  2020/02/14   \n215  ssM-CpGiant-5-20200214    LM309  2020/02/14   \n216  ssM-CpGiant-6-20200214    LM310  2020/02/14   \n\n                                  杂交探针  杂交时间h  PostPCR循环数 PostPCR浓度ng/ul  \\\n0    SeqCap Epi CpGiant Enrichment Kit   66.5          12           16.2   \n1    SeqCap Epi CpGiant Enrichment Kit   67.5          13           17.6   \n2    SeqCap Epi CpGiant Enrichment Kit   67.0          12            110   \n3    SeqCap Epi CpGiant Enrichment Kit   67.0          12            110   \n4    SeqCap Epi CpGiant Enrichment Kit   70.0          11           38.8   \n..                                 ...    ...         ...            ...   \n212  SeqCap Epi CpGiant Enrichment Kit   70.5           9             46   \n213  SeqCap Epi CpGiant Enrichment Kit   70.5           9           48.8   \n214  SeqCap Epi CpGiant Enrichment Kit   70.5           9           55.6   \n215  SeqCap Epi CpGiant Enrichment Kit   70.5           9             56   \n216  SeqCap Epi CpGiant Enrichment Kit   70.5           9           51.6   \n\n     洗脱体积  操作人  备注  \n0    40.0  NaN NaN  \n1    40.0  NaN NaN  \n2    40.0  NaN NaN  \n3    40.0  NaN NaN  \n4    40.0  NaN NaN  \n..    ...  ...  ..  \n212  40.0  LZL NaN  \n213  40.0  LZL NaN  \n214  40.0  LZL NaN  \n215  40.0  LZL NaN  \n216  40.0  LZL NaN  \n\n[217 rows x 10 columns]>"
     },
     "metadata": {},
     "execution_count": 16
    }
   ],
   "source": [
    "capture_raw.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                        文库名称  虚拟入库样本名        杂交日期  \\\n3    ssM-CpGiant-4-20190104  IDTL145  2019/01/04   \n4    ssM-CpGiant-2-20190109    LM001  2019/01/09   \n5    ssM-CpGiant-3-20190109    LM002  2019/01/09   \n6    ssM-CpGiant-4-20190109    LM002  2019/01/09   \n7    ssM-CpGiant-5-20190109    LM003  2019/01/09   \n..                      ...      ...         ...   \n207  ssM-CpGiant-6-20200131    LM287  2020/01/31   \n208  ssM-CpGiant-7-20200131    LM287  2020/01/31   \n209  ssM-CpGiant-6-20200131    LM319  2020/01/31   \n210  ssM-CpGiant-7-20200131    LM319  2020/01/31   \n211  ssM-CpGiant-8-20200131    LM289  2020/01/31   \n\n                                  杂交探针  杂交时间h  PostPCR循环数 PostPCR浓度ng/ul  \\\n3    SeqCap Epi CpGiant Enrichment Kit   67.0          12            110   \n4    SeqCap Epi CpGiant Enrichment Kit   70.0          11           38.8   \n5    SeqCap Epi CpGiant Enrichment Kit   70.0          11           59.8   \n6    SeqCap Epi CpGiant Enrichment Kit   70.0          11           48.8   \n7    SeqCap Epi CpGiant Enrichment Kit   70.0          11              0   \n..                                 ...    ...         ...            ...   \n207  SeqCap Epi CpGiant Enrichment Kit   65.5          10           77.8   \n208  SeqCap Epi CpGiant Enrichment Kit   65.5          10           72.2   \n209  SeqCap Epi CpGiant Enrichment Kit   65.5          10           77.8   \n210  SeqCap Epi CpGiant Enrichment Kit   65.5          10           72.2   \n211  SeqCap Epi CpGiant Enrichment Kit   65.5          10           89.2   \n\n     洗脱体积  操作人  备注  \n3    40.0  NaN NaN  \n4    40.0  NaN NaN  \n5    40.0  NaN NaN  \n6    40.0  NaN NaN  \n7    40.0  NaN NaN  \n..    ...  ...  ..  \n207  40.0  LZL NaN  \n208  40.0  LZL NaN  \n209  40.0  LZL NaN  \n210  40.0  LZL NaN  \n211  40.0  LZL NaN  \n\n[198 rows x 10 columns]>"
     },
     "metadata": {},
     "execution_count": 17
    }
   ],
   "source": [
    "capture_raw_filt = capture_raw.loc[capture_raw.loc[:, \"文库名称\"].isin(pooling_raw_filt.loc[:, \"文库名称\"]), :]\n",
    "capture_raw_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of              虚拟入库样本名1        送测日期        上机时间        下机时间  \\\n0    IDTL130, IDTL131         NaN  2018/12/27  2018/12/29   \n1             IDTL142         NaN  2019/01/12  2019/01/14   \n2             IDTL145         NaN  2019/01/12  2019/01/14   \n3               LM001         NaN  2019/01/16  2019/01/18   \n4               LM002         NaN  2019/01/16  2019/01/19   \n..                ...         ...         ...         ...   \n149             LM284  2020/02/03  2020/02/04  2020/02/06   \n150             LM285  2020/02/03  2020/02/04  2020/02/06   \n151             LM286  2020/02/04  2020/02/05  2020/02/07   \n152             LM287  2020/02/04  2020/02/05  2020/02/07   \n153             LM289  2020/02/04  2020/02/05  2020/02/07   \n\n                              机器号                     芯片号  \\\n0                   R100400180038  V300012489, V300012514   \n1                   R100400180015  V300012566, V300012600   \n2                   R100400180007  V300012612, V300012618   \n3                   R100400180015  V300012535, V300012554   \n4                   R100400180004  V300012593, V300012614   \n..                            ...                     ...   \n149                 R100400170016  V300046611, V300044814   \n150  R100400180014, R100400180013  V300032757, V300032653   \n151                 R100400180038  V300044882, V300046512   \n152                 R100400180120  V300044876, V300044921   \n153                 R100400180025  V300044901, V300044737   \n\n                                              捕获文库名            测序编号  \n0    ssM-CpGiant-1-20181218, ssM-CpGiant-2-20181218  seq_20181227_1  \n1                            ssM-CpGiant-1-20190104  seq_20190112_1  \n2                            ssM-CpGiant-4-20190104  seq_20190112_2  \n3                            ssM-CpGiant-2-20190109  seq_20190116_1  \n4    ssM-CpGiant-3-20190109, ssM-CpGiant-4-20190109  seq_20190116_2  \n..                                              ...             ...  \n149                          ssM-CpGiant-3-20200131  seq_20200204_1  \n150                          ssM-CpGiant-4-20200131  seq_20200204_2  \n151                          ssM-CpGiant-5-20200131  seq_20200205_1  \n152  ssM-CpGiant-6-20200131, ssM-CpGiant-7-20200131  seq_20200205_2  \n153                          ssM-CpGiant-8-20200131  seq_20200205_3  \n\n[154 rows x 8 columns]>"
     },
     "metadata": {},
     "execution_count": 18
    }
   ],
   "source": [
    "seq_raw.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of          虚拟入库样本名1        送测日期        上机时间        下机时间  \\\n2         IDTL145         NaN  2019/01/12  2019/01/14   \n3           LM001         NaN  2019/01/16  2019/01/18   \n4           LM002         NaN  2019/01/16  2019/01/19   \n5           LM004         NaN  2019/01/21  2019/01/23   \n6    LM004, LM005         NaN  2019/01/26  2019/01/28   \n..            ...         ...         ...         ...   \n149         LM284  2020/02/03  2020/02/04  2020/02/06   \n150         LM285  2020/02/03  2020/02/04  2020/02/06   \n151         LM286  2020/02/04  2020/02/05  2020/02/07   \n152         LM287  2020/02/04  2020/02/05  2020/02/07   \n153         LM289  2020/02/04  2020/02/05  2020/02/07   \n\n                              机器号                     芯片号  \\\n2                   R100400180007  V300012612, V300012618   \n3                   R100400180015  V300012535, V300012554   \n4                   R100400180004  V300012593, V300012614   \n5                   R100400180004  V300012810, V300012556   \n6                   R100400180006              V300013650   \n..                            ...                     ...   \n149                 R100400170016  V300046611, V300044814   \n150  R100400180014, R100400180013  V300032757, V300032653   \n151                 R100400180038  V300044882, V300046512   \n152                 R100400180120  V300044876, V300044921   \n153                 R100400180025  V300044901, V300044737   \n\n                                                 捕获文库名            测序编号  \n2                               ssM-CpGiant-4-20190104  seq_20190112_2  \n3                               ssM-CpGiant-2-20190109  seq_20190116_1  \n4       ssM-CpGiant-3-20190109, ssM-CpGiant-4-20190109  seq_20190116_2  \n5       ssM-CpGiant-1-20190115, ssM-CpGiant-2-20190115  seq_20190121_1  \n6    ssM-CpGiant-1-20190115, ssM-CpGiant-2-20190115...  seq_20190126_1  \n..                                                 ...             ...  \n149                             ssM-CpGiant-3-20200131  seq_20200204_1  \n150                             ssM-CpGiant-4-20200131  seq_20200204_2  \n151                             ssM-CpGiant-5-20200131  seq_20200205_1  \n152     ssM-CpGiant-6-20200131, ssM-CpGiant-7-20200131  seq_20200205_2  \n153                             ssM-CpGiant-8-20200131  seq_20200205_3  \n\n[146 rows x 8 columns]>"
     },
     "metadata": {},
     "execution_count": 19
    }
   ],
   "source": [
    "filter = []\n",
    "for idx in range(seq_raw.shape[0]):\n",
    "    id_list = str(seq_raw.loc[idx, \"捕获文库名\"]).split(\", \")\n",
    "    match_mark = 0\n",
    "    for id in id_list:\n",
    "        id = id.strip()\n",
    "        if id in capture_raw_filt.loc[:, \"文库名称\"].to_string().strip():\n",
    "            match_mark = 1\n",
    "            break\n",
    "    if match_mark:\n",
    "        filter.append(True)\n",
    "    else:\n",
    "        filter.append(False)\n",
    "seq_raw_filt = seq_raw.loc[filter, :]\n",
    "seq_raw_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "count = {}\n",
    "tmp = []\n",
    "for idx in range(qc_raw.shape[0]):\n",
    "    k = qc_raw.loc[idx, \"sampleID_shift\"]\n",
    "    count[k] = 1+count[k] if k in count else 1\n",
    "    tmp.append(\"{}_{}\".format(k, count[k]))\n",
    "qc_raw.loc[:, \"测序文库名\"] = tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of            Sample  Data_Size(Gb) Clean_Rate(%)  R1_Q20  R2_Q20  R1_Q30  \\\n0     18S4146666M          40.23        93.45%  97.42%  94.71%  90.07%   \n1     18S4146667M          37.36        93.36%  97.32%  94.48%  89.72%   \n2     18S4146668M          30.61        93.32%  97.21%  94.59%  89.35%   \n3     18S4146669M          32.03        93.65%  97.24%  94.85%  89.48%   \n4     18S4070982W          36.96        92.09%  97.48%  94.92%  89.78%   \n...           ...            ...           ...     ...     ...     ...   \n1936  20L5383610M          36.98        96.23%  97.99%  96.77%  91.41%   \n1937  20S5383649M          33.65        97.04%  97.99%  96.88%  91.39%   \n1938  20L5383609M          33.50        96.73%  97.86%  96.64%  90.98%   \n1939  20L5383614M          34.46        94.11%  97.40%  95.61%  89.58%   \n1940  20S5383654M          36.87        94.39%  97.85%  97.17%  91.04%   \n\n      R2_Q30 GC_Content BS_conversion_rate(lambda_DNA)  \\\n0     84.46%     30.91%                             -%   \n1     83.83%     31.43%                             -%   \n2     84.04%     30.19%                             -%   \n3     84.70%     30.75%                             -%   \n4     84.73%     25.84%                             -%   \n...      ...        ...                            ...   \n1936  88.66%     29.12%                         99.49%   \n1937  88.84%     28.66%                         99.49%   \n1938  88.18%     29.29%                         99.47%   \n1939  86.11%     29.34%                         99.46%   \n1940  90.18%     28.57%                         99.46%   \n\n     BS_conversion_rate(CHH)  ...  \\\n0                     99.43%  ...   \n1                     99.34%  ...   \n2                     99.42%  ...   \n3                     99.44%  ...   \n4                     -8.00%  ...   \n...                      ...  ...   \n1936                  99.12%  ...   \n1937                  99.24%  ...   \n1938                  99.26%  ...   \n1939                  99.19%  ...   \n1940                  99.13%  ...   \n\n                                             sample.txt  \\\n0     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS-WGBS_20...   \n1     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS-WGBS_20...   \n2     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS-WGBS_20...   \n3     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS-WGBS_20...   \n4     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/WGBS_201812...   \n...                                                 ...   \n1936  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...   \n1937  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...   \n1938  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...   \n1939  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...   \n1940  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...   \n\n                                             result.xls machine_chip  \\\n0     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS-WGBS_20...          NaN   \n1     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS-WGBS_20...          NaN   \n2     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS-WGBS_20...          NaN   \n3     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS-WGBS_20...          NaN   \n4     /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/WGBS_201812...          NaN   \n...                                                 ...          ...   \n1936  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...          NaN   \n1937  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...          NaN   \n1938  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...          NaN   \n1939  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...          NaN   \n1940  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Tissue_2020...          NaN   \n\n     seqId_list best_seqId        样本编号 empty_colums       date  \\\n0           NaN        NaN  18S4P46666           18 2018-12-03   \n1           NaN        NaN  18S4P46667           18 2018-12-03   \n2           NaN        NaN  18S4P46668           18 2018-12-03   \n3           NaN        NaN  18S4P46669           18 2018-12-03   \n4           NaN        NaN  18S4P70982           17 2018-12-21   \n...         ...        ...         ...          ...        ...   \n1936        NaN        NaN  20P5383610            3 2020-03-30   \n1937        NaN        NaN  20P5383649            3 2020-03-30   \n1938        NaN        NaN  20P5383609            3 2020-03-30   \n1939        NaN        NaN  20P5383614            3 2020-03-30   \n1940        NaN        NaN  20P5383654            3 2020-03-30   \n\n       sampleID_shift              测序文库名  \n0     18S4146666_1_m1  18S4146666_1_m1_1  \n1     18S4146667_1_m1  18S4146667_1_m1_1  \n2     18S4146668_1_m1  18S4146668_1_m1_1  \n3     18S4146669_1_m1  18S4146669_1_m1_1  \n4     18S4070982_1_m1  18S4070982_1_m1_1  \n...               ...                ...  \n1936  20L5383610_1_m1  20L5383610_1_m1_1  \n1937  20S5383649_1_m1  20S5383649_1_m1_1  \n1938  20L5383609_1_m1  20L5383609_1_m1_1  \n1939  20L5383614_1_m1  20L5383614_1_m1_1  \n1940  20S5383654_1_m1  20S5383654_1_m1_1  \n\n[1941 rows x 54 columns]>"
     },
     "metadata": {},
     "execution_count": 22
    }
   ],
   "source": [
    "qc_raw.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of            Sample  Data_Size(Gb) Clean_Rate(%)  R1_Q20  R2_Q20  R1_Q30  \\\n20    18P4063775M          33.83        90.11%  97.72%  93.98%  90.63%   \n24    18P4063823M          34.26        88.28%  97.83%  94.13%  91.06%   \n25    18P4062832M          29.38        89.68%  97.60%  93.85%  90.19%   \n27    18P4063819M          30.79        90.46%  97.62%  94.13%  90.35%   \n28    18P4063845M          32.48        90.66%  97.61%  94.01%  90.27%   \n...           ...            ...           ...     ...     ...     ...   \n1848  19P7247843M          37.42        93.36%  97.73%  96.36%  90.51%   \n1849  19P7247847M          44.69        95.63%  98.07%  97.10%  91.30%   \n1850  19P7247853M          43.26        95.02%  97.77%  96.98%  90.31%   \n1851  19P7247861M          37.82        94.15%  97.78%  96.56%  90.67%   \n1852  19P7247863M          35.73        93.61%  97.61%  96.48%  90.15%   \n\n      R2_Q30 GC_Content BS_conversion_rate(lambda_DNA)  \\\n20    83.42%     28.30%                         99.59%   \n24    83.80%     27.87%                         99.65%   \n25    83.13%     28.99%                         99.67%   \n27    83.79%     28.73%                         99.63%   \n28    83.53%     28.19%                         99.60%   \n...      ...        ...                            ...   \n1848  87.86%     31.44%                         99.46%   \n1849  88.97%     31.19%                         99.46%   \n1850  88.68%     31.07%                         99.47%   \n1851  88.46%     31.17%                         99.45%   \n1852  88.18%     31.67%                         99.47%   \n\n     BS_conversion_rate(CHH)  ...  \\\n20                    99.36%  ...   \n24                    99.38%  ...   \n25                    99.33%  ...   \n27                    99.36%  ...   \n28                    99.27%  ...   \n...                      ...  ...   \n1848                  99.24%  ...   \n1849                  99.27%  ...   \n1850                  99.28%  ...   \n1851                  99.24%  ...   \n1852                  99.25%  ...   \n\n                                             sample.txt  \\\n20    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n24    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n25    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n27    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n28    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n...                                                 ...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1851  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                             result.xls  \\\n20    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n24    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n25    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n27    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n28    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n...                                                 ...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1851  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                           machine_chip      seqId_list  \\\n20    R100400180007_V300012618, R100400180007_V30001...  seq_20190112_2   \n24    R100400180007_V300012618, R100400180007_V30001...  seq_20190112_2   \n25    R100400180007_V300012618, R100400180007_V30001...  seq_20190112_2   \n27    R100400180007_V300012618, R100400180007_V30001...  seq_20190112_2   \n28    R100400180007_V300012618, R100400180007_V30001...  seq_20190112_2   \n...                                                 ...             ...   \n1848  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1849  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1850  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1851  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1852  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n\n          best_seqId        样本编号 empty_colums       date   sampleID_shift  \\\n20    seq_20190112_2  18P4P63775           14 2019-01-14  18P4063775_1_m1   \n24    seq_20190112_2  18P4P63823           14 2019-01-14  18P4063823_1_m1   \n25    seq_20190112_2  18P4P62832           14 2019-01-14  18P4062832_1_m1   \n27    seq_20190112_2  18P4P63819           14 2019-01-14  18P4063819_1_m1   \n28    seq_20190112_2  18P4P63845           14 2019-01-14  18P4063845_1_m1   \n...              ...         ...          ...        ...              ...   \n1848  seq_20200205_3  19P7247843            0 2020-02-07  19P7247843_1_m1   \n1849  seq_20200205_2  19P7247847            0 2020-02-07  19P7247847_1_m1   \n1850  seq_20200205_2  19P7247853            0 2020-02-07  19P7247853_1_m1   \n1851  seq_20200205_3  19P7247861            0 2020-02-07  19P7247861_1_m1   \n1852  seq_20200205_3  19P7247863            0 2020-02-07  19P7247863_1_m1   \n\n                  测序文库名  \n20    18P4063775_1_m1_1  \n24    18P4063823_1_m1_1  \n25    18P4062832_1_m1_1  \n27    18P4063819_1_m1_1  \n28    18P4063845_1_m1_1  \n...                 ...  \n1848  19P7247843_1_m1_1  \n1849  19P7247847_1_m1_1  \n1850  19P7247853_1_m1_1  \n1851  19P7247861_1_m1_1  \n1852  19P7247863_1_m1_1  \n\n[1064 rows x 54 columns]>"
     },
     "metadata": {},
     "execution_count": 23
    }
   ],
   "source": [
    "qc_raw_filt = qc_raw.loc[qc_raw.loc[:, \"best_seqId\"].isin(seq_raw_filt.loc[:,\"测序编号\"]), :]\n",
    "qc_raw_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of            Sample  Data_Size(Gb) Clean_Rate(%)  R1_Q20  R2_Q20  R1_Q30  \\\n38    18P4063833M          27.31        92.69%  98.20%  95.94%  91.72%   \n42    18P4063839M          32.00        93.71%  98.18%  95.91%  91.54%   \n48    18P4063830M          24.08        92.50%  98.12%  95.17%  91.26%   \n53    19P0126529M          69.64        92.71%  98.21%  95.43%  91.72%   \n58    19P0126533M          60.09        88.71%  98.21%  95.48%  91.72%   \n...           ...            ...           ...     ...     ...     ...   \n1848  19P7247843M          37.42        93.36%  97.73%  96.36%  90.51%   \n1849  19P7247847M          44.69        95.63%  98.07%  97.10%  91.30%   \n1850  19P7247853M          43.26        95.02%  97.77%  96.98%  90.31%   \n1851  19P7247861M          37.82        94.15%  97.78%  96.56%  90.67%   \n1852  19P7247863M          35.73        93.61%  97.61%  96.48%  90.15%   \n\n      R2_Q30 GC_Content BS_conversion_rate(lambda_DNA)  \\\n38    87.33%     27.24%                         99.56%   \n42    87.21%     27.20%                         99.62%   \n48    85.31%     27.84%                         99.69%   \n53    85.97%     28.30%                            NaN   \n58    86.14%     27.17%                            NaN   \n...      ...        ...                            ...   \n1848  87.86%     31.44%                         99.46%   \n1849  88.97%     31.19%                         99.46%   \n1850  88.68%     31.07%                         99.47%   \n1851  88.46%     31.17%                         99.45%   \n1852  88.18%     31.67%                         99.47%   \n\n     BS_conversion_rate(CHH)  ...  \\\n38                    99.33%  ...   \n42                    99.45%  ...   \n48                    99.37%  ...   \n53                       NaN  ...   \n58                       NaN  ...   \n...                      ...  ...   \n1848                  99.24%  ...   \n1849                  99.27%  ...   \n1850                  99.28%  ...   \n1851                  99.24%  ...   \n1852                  99.25%  ...   \n\n                                             sample.txt  \\\n38    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n42    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n48    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n53    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n58    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n...                                                 ...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1851  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                             result.xls  \\\n38    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n42    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n48    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n53    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n58    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n...                                                 ...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1851  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                           machine_chip      seqId_list  \\\n38    R100400180004_V300012614, R100400180004_V30001...  seq_20190116_2   \n42    R100400180004_V300012614, R100400180004_V30001...  seq_20190116_2   \n48    R100400180015_V300012535, R100400180015_V30001...  seq_20190116_1   \n53    R100400180015_V300012621, R100400180015_V30001...  seq_20190121_2   \n58    R100400180015_V300012621, R100400180015_V30001...  seq_20190121_2   \n...                                                 ...             ...   \n1848  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1849  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1850  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1851  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1852  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n\n          best_seqId        样本编号 empty_colums       date   sampleID_shift  \\\n38    seq_20190116_2  18P4063833           14 2019-01-18  18P4063833_1_m1   \n42    seq_20190116_2  18P4063839           14 2019-01-18  18P4063839_1_m1   \n48    seq_20190116_1  18P4063830           14 2019-01-18  18P4063830_1_m1   \n53    seq_20190121_2  19P0126529           20 2019-01-23  19P0126529_1_m1   \n58    seq_20190121_2  19P0126533           20 2019-01-23  19P0126533_1_m1   \n...              ...         ...          ...        ...              ...   \n1848  seq_20200205_3  19P7247843            0 2020-02-07  19P7247843_1_m1   \n1849  seq_20200205_2  19P7247847            0 2020-02-07  19P7247847_1_m1   \n1850  seq_20200205_2  19P7247853            0 2020-02-07  19P7247853_1_m1   \n1851  seq_20200205_3  19P7247861            0 2020-02-07  19P7247861_1_m1   \n1852  seq_20200205_3  19P7247863            0 2020-02-07  19P7247863_1_m1   \n\n                  测序文库名  \n38    18P4063833_1_m1_1  \n42    18P4063839_1_m1_1  \n48    18P4063830_1_m1_1  \n53    19P0126529_1_m1_1  \n58    19P0126533_1_m1_1  \n...                 ...  \n1848  19P7247843_1_m1_1  \n1849  19P7247847_1_m1_1  \n1850  19P7247853_1_m1_1  \n1851  19P7247861_1_m1_1  \n1852  19P7247863_1_m1_1  \n\n[912 rows x 54 columns]>"
     },
     "metadata": {},
     "execution_count": 33
    }
   ],
   "source": [
    "qc_raw_filt_subset1 = qc_raw_filt.loc[qc_raw_filt.loc[:, \"样本编号\"].isin(sampleId_unique_overlap), :]\n",
    "qc_raw_filt_subset1.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of            Sample  Data_Size(Gb) Clean_Rate(%)  R1_Q20  R2_Q20  R1_Q30  \\\n24    18P4063823M          34.26        88.28%  97.83%  94.13%  91.06%   \n28    18P4063845M          32.48        90.66%  97.61%  94.01%  90.27%   \n38    18P4063833M          27.31        92.69%  98.20%  95.94%  91.72%   \n42    18P4063839M          32.00        93.71%  98.18%  95.91%  91.54%   \n48    18P4063830M          24.08        92.50%  98.12%  95.17%  91.26%   \n...           ...            ...           ...     ...     ...     ...   \n1848  19P7247843M          37.42        93.36%  97.73%  96.36%  90.51%   \n1849  19P7247847M          44.69        95.63%  98.07%  97.10%  91.30%   \n1850  19P7247853M          43.26        95.02%  97.77%  96.98%  90.31%   \n1851  19P7247861M          37.82        94.15%  97.78%  96.56%  90.67%   \n1852  19P7247863M          35.73        93.61%  97.61%  96.48%  90.15%   \n\n      R2_Q30 GC_Content BS_conversion_rate(lambda_DNA)  \\\n24    83.80%     27.87%                         99.65%   \n28    83.53%     28.19%                         99.60%   \n38    87.33%     27.24%                         99.56%   \n42    87.21%     27.20%                         99.62%   \n48    85.31%     27.84%                         99.69%   \n...      ...        ...                            ...   \n1848  87.86%     31.44%                         99.46%   \n1849  88.97%     31.19%                         99.46%   \n1850  88.68%     31.07%                         99.47%   \n1851  88.46%     31.17%                         99.45%   \n1852  88.18%     31.67%                         99.47%   \n\n     BS_conversion_rate(CHH)  ...  \\\n24                    99.38%  ...   \n28                    99.27%  ...   \n38                    99.33%  ...   \n42                    99.45%  ...   \n48                    99.37%  ...   \n...                      ...  ...   \n1848                  99.24%  ...   \n1849                  99.27%  ...   \n1850                  99.28%  ...   \n1851                  99.24%  ...   \n1852                  99.25%  ...   \n\n                                             sample.txt  \\\n24    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n28    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n38    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n42    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n48    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n...                                                 ...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1851  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                             result.xls  \\\n24    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n28    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n38    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n42    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n48    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n...                                                 ...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1851  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                           machine_chip      seqId_list  \\\n24    R100400180007_V300012618, R100400180007_V30001...  seq_20190112_2   \n28    R100400180007_V300012618, R100400180007_V30001...  seq_20190112_2   \n38    R100400180004_V300012614, R100400180004_V30001...  seq_20190116_2   \n42    R100400180004_V300012614, R100400180004_V30001...  seq_20190116_2   \n48    R100400180015_V300012535, R100400180015_V30001...  seq_20190116_1   \n...                                                 ...             ...   \n1848  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1849  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1850  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1851  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1852  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n\n          best_seqId        样本编号 empty_colums       date   sampleID_shift  \\\n24    seq_20190112_2  18P4P63823           14 2019-01-14  18P4063823_1_m1   \n28    seq_20190112_2  18P4P63845           14 2019-01-14  18P4063845_1_m1   \n38    seq_20190116_2  18P4063833           14 2019-01-18  18P4063833_1_m1   \n42    seq_20190116_2  18P4063839           14 2019-01-18  18P4063839_1_m1   \n48    seq_20190116_1  18P4063830           14 2019-01-18  18P4063830_1_m1   \n...              ...         ...          ...        ...              ...   \n1848  seq_20200205_3  19P7247843            0 2020-02-07  19P7247843_1_m1   \n1849  seq_20200205_2  19P7247847            0 2020-02-07  19P7247847_1_m1   \n1850  seq_20200205_2  19P7247853            0 2020-02-07  19P7247853_1_m1   \n1851  seq_20200205_3  19P7247861            0 2020-02-07  19P7247861_1_m1   \n1852  seq_20200205_3  19P7247863            0 2020-02-07  19P7247863_1_m1   \n\n                  测序文库名  \n24    18P4063823_1_m1_1  \n28    18P4063845_1_m1_1  \n38    18P4063833_1_m1_1  \n42    18P4063839_1_m1_1  \n48    18P4063830_1_m1_1  \n...                 ...  \n1848  19P7247843_1_m1_1  \n1849  19P7247847_1_m1_1  \n1850  19P7247853_1_m1_1  \n1851  19P7247861_1_m1_1  \n1852  19P7247863_1_m1_1  \n\n[898 rows x 54 columns]>"
     },
     "metadata": {},
     "execution_count": 34
    }
   ],
   "source": [
    "qc_raw_filt_subset2 = qc_raw_filt.loc[qc_raw_filt.loc[:, \"sampleID_shift\"].isin(library_raw_filt.loc[:, \"建库编号\"]), :]\n",
    "qc_raw_filt_subset2.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of            Sample  Data_Size(Gb) Clean_Rate(%)  R1_Q20  R2_Q20  R1_Q30  \\\n38    18P4063833M          27.31        92.69%  98.20%  95.94%  91.72%   \n42    18P4063839M          32.00        93.71%  98.18%  95.91%  91.54%   \n48    18P4063830M          24.08        92.50%  98.12%  95.17%  91.26%   \n53    19P0126529M          69.64        92.71%  98.21%  95.43%  91.72%   \n58    19P0126533M          60.09        88.71%  98.21%  95.48%  91.72%   \n...           ...            ...           ...     ...     ...     ...   \n1848  19P7247843M          37.42        93.36%  97.73%  96.36%  90.51%   \n1849  19P7247847M          44.69        95.63%  98.07%  97.10%  91.30%   \n1850  19P7247853M          43.26        95.02%  97.77%  96.98%  90.31%   \n1851  19P7247861M          37.82        94.15%  97.78%  96.56%  90.67%   \n1852  19P7247863M          35.73        93.61%  97.61%  96.48%  90.15%   \n\n      R2_Q30 GC_Content BS_conversion_rate(lambda_DNA)  \\\n38    87.33%     27.24%                         99.56%   \n42    87.21%     27.20%                         99.62%   \n48    85.31%     27.84%                         99.69%   \n53    85.97%     28.30%                            NaN   \n58    86.14%     27.17%                            NaN   \n...      ...        ...                            ...   \n1848  87.86%     31.44%                         99.46%   \n1849  88.97%     31.19%                         99.46%   \n1850  88.68%     31.07%                         99.47%   \n1851  88.46%     31.17%                         99.45%   \n1852  88.18%     31.67%                         99.47%   \n\n     BS_conversion_rate(CHH)  ...  \\\n38                    99.33%  ...   \n42                    99.45%  ...   \n48                    99.37%  ...   \n53                       NaN  ...   \n58                       NaN  ...   \n...                      ...  ...   \n1848                  99.24%  ...   \n1849                  99.27%  ...   \n1850                  99.28%  ...   \n1851                  99.24%  ...   \n1852                  99.25%  ...   \n\n                                             sample.txt  \\\n38    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n42    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n48    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n53    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n58    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n...                                                 ...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1851  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                             result.xls  \\\n38    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n42    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n48    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n53    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n58    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n...                                                 ...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1851  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                           machine_chip      seqId_list  \\\n38    R100400180004_V300012614, R100400180004_V30001...  seq_20190116_2   \n42    R100400180004_V300012614, R100400180004_V30001...  seq_20190116_2   \n48    R100400180015_V300012535, R100400180015_V30001...  seq_20190116_1   \n53    R100400180015_V300012621, R100400180015_V30001...  seq_20190121_2   \n58    R100400180015_V300012621, R100400180015_V30001...  seq_20190121_2   \n...                                                 ...             ...   \n1848  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1849  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1850  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1851  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1852  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n\n          best_seqId        样本编号 empty_colums       date   sampleID_shift  \\\n38    seq_20190116_2  18P4063833           14 2019-01-18  18P4063833_1_m1   \n42    seq_20190116_2  18P4063839           14 2019-01-18  18P4063839_1_m1   \n48    seq_20190116_1  18P4063830           14 2019-01-18  18P4063830_1_m1   \n53    seq_20190121_2  19P0126529           20 2019-01-23  19P0126529_1_m1   \n58    seq_20190121_2  19P0126533           20 2019-01-23  19P0126533_1_m1   \n...              ...         ...          ...        ...              ...   \n1848  seq_20200205_3  19P7247843            0 2020-02-07  19P7247843_1_m1   \n1849  seq_20200205_2  19P7247847            0 2020-02-07  19P7247847_1_m1   \n1850  seq_20200205_2  19P7247853            0 2020-02-07  19P7247853_1_m1   \n1851  seq_20200205_3  19P7247861            0 2020-02-07  19P7247861_1_m1   \n1852  seq_20200205_3  19P7247863            0 2020-02-07  19P7247863_1_m1   \n\n                  测序文库名  \n38    18P4063833_1_m1_1  \n42    18P4063839_1_m1_1  \n48    18P4063830_1_m1_1  \n53    19P0126529_1_m1_1  \n58    19P0126533_1_m1_1  \n...                 ...  \n1848  19P7247843_1_m1_1  \n1849  19P7247847_1_m1_1  \n1850  19P7247853_1_m1_1  \n1851  19P7247861_1_m1_1  \n1852  19P7247863_1_m1_1  \n\n[896 rows x 54 columns]>"
     },
     "metadata": {},
     "execution_count": 35
    }
   ],
   "source": [
    "qc_raw_filt_subset3 = qc_raw_filt_subset1.loc[qc_raw_filt_subset1.loc[:, \"sampleID_shift\"].isin(library_raw_filt.loc[:, \"建库编号\"]), :]\n",
    "qc_raw_filt_subset3.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of            Sample  Data_Size(Gb) Clean_Rate(%)  R1_Q20  R2_Q20  R1_Q30  \\\n38    18P4063833M          27.31        92.69%  98.20%  95.94%  91.72%   \n42    18P4063839M          32.00        93.71%  98.18%  95.91%  91.54%   \n48    18P4063830M          24.08        92.50%  98.12%  95.17%  91.26%   \n53    19P0126529M          69.64        92.71%  98.21%  95.43%  91.72%   \n58    19P0126533M          60.09        88.71%  98.21%  95.48%  91.72%   \n...           ...            ...           ...     ...     ...     ...   \n1847  19P7247859M          38.99        94.28%  97.69%  96.85%  90.18%   \n1848  19P7247843M          37.42        93.36%  97.73%  96.36%  90.51%   \n1849  19P7247847M          44.69        95.63%  98.07%  97.10%  91.30%   \n1850  19P7247853M          43.26        95.02%  97.77%  96.98%  90.31%   \n1852  19P7247863M          35.73        93.61%  97.61%  96.48%  90.15%   \n\n      R2_Q30 GC_Content BS_conversion_rate(lambda_DNA)  \\\n38    87.33%     27.24%                         99.56%   \n42    87.21%     27.20%                         99.62%   \n48    85.31%     27.84%                         99.69%   \n53    85.97%     28.30%                            NaN   \n58    86.14%     27.17%                            NaN   \n...      ...        ...                            ...   \n1847  88.35%     31.69%                         99.48%   \n1848  87.86%     31.44%                         99.46%   \n1849  88.97%     31.19%                         99.46%   \n1850  88.68%     31.07%                         99.47%   \n1852  88.18%     31.67%                         99.47%   \n\n     BS_conversion_rate(CHH)  ...  \\\n38                    99.33%  ...   \n42                    99.45%  ...   \n48                    99.37%  ...   \n53                       NaN  ...   \n58                       NaN  ...   \n...                      ...  ...   \n1847                  99.25%  ...   \n1848                  99.24%  ...   \n1849                  99.27%  ...   \n1850                  99.28%  ...   \n1852                  99.25%  ...   \n\n                                             sample.txt  \\\n38    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n42    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n48    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n53    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n58    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n...                                                 ...   \n1847  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                             result.xls  \\\n38    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n42    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n48    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019011...   \n53    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n58    /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/TBS_2019012...   \n...                                                 ...   \n1847  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1848  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1849  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1850  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n1852  /zfssz3/BC_RD_P3/BC_RDC5/pengjiaxi/Plasma_2020...   \n\n                                           machine_chip      seqId_list  \\\n38    R100400180004_V300012614, R100400180004_V30001...  seq_20190116_2   \n42    R100400180004_V300012614, R100400180004_V30001...  seq_20190116_2   \n48    R100400180015_V300012535, R100400180015_V30001...  seq_20190116_1   \n53    R100400180015_V300012621, R100400180015_V30001...  seq_20190121_2   \n58    R100400180015_V300012621, R100400180015_V30001...  seq_20190121_2   \n...                                                 ...             ...   \n1847  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1848  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n1849  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1850  R100400180120_V300044921, R100400180120_V30004...  seq_20200205_2   \n1852  R100400180025_V300044901, R100400180025_V30004...  seq_20200205_3   \n\n          best_seqId        样本编号 empty_colums       date   sampleID_shift  \\\n38    seq_20190116_2  18P4063833           14 2019-01-18  18P4063833_1_m1   \n42    seq_20190116_2  18P4063839           14 2019-01-18  18P4063839_1_m1   \n48    seq_20190116_1  18P4063830           14 2019-01-18  18P4063830_1_m1   \n53    seq_20190121_2  19P0126529           20 2019-01-23  19P0126529_1_m1   \n58    seq_20190121_2  19P0126533           20 2019-01-23  19P0126533_1_m1   \n...              ...         ...          ...        ...              ...   \n1847  seq_20200205_2  19P7247859            0 2020-02-07  19P7247859_1_m1   \n1848  seq_20200205_3  19P7247843            0 2020-02-07  19P7247843_1_m1   \n1849  seq_20200205_2  19P7247847            0 2020-02-07  19P7247847_1_m1   \n1850  seq_20200205_2  19P7247853            0 2020-02-07  19P7247853_1_m1   \n1852  seq_20200205_3  19P7247863            0 2020-02-07  19P7247863_1_m1   \n\n                  测序文库名  \n38    18P4063833_1_m1_1  \n42    18P4063839_1_m1_1  \n48    18P4063830_1_m1_1  \n53    19P0126529_1_m1_1  \n58    19P0126533_1_m1_1  \n...                 ...  \n1847  19P7247859_1_m1_1  \n1848  19P7247843_1_m1_1  \n1849  19P7247847_1_m1_1  \n1850  19P7247853_1_m1_1  \n1852  19P7247863_1_m1_1  \n\n[895 rows x 54 columns]>"
     },
     "metadata": {},
     "execution_count": 36
    }
   ],
   "source": [
    "qc_raw_filt_subset4 = qc_raw_filt_subset3.loc[qc_raw_filt_subset3.loc[:, \"测序文库名\"].isin(pooling_raw_filt.loc[:, \"测序文库名\"]), :]\n",
    "qc_raw_filt_subset4.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# idx1_2 = [x for x in list(qc_raw_filt_subset1.index) if x not in list(qc_raw_filt_subset2.index)]\n",
    "# qc_raw_filt.loc[idx1_2, :]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pd.ExcelWriter(\"final/qc.20200430.xlsx\") as writer:\n",
    "    qc_raw_filt.to_excel(writer, sheet_name=\"result\", index=False)\n",
    "    qc_raw_filt_subset4.to_excel(writer, sheet_name=\"filtBy_clinicalAndLibraryAndPooling\", index=False)\n",
    "seq_raw_filt.to_excel(\"final/sequence.20200430.xlsx\", index=False)\n",
    "pooling_raw_filt.to_excel(\"final/pooling.20200430.xlsx\", index=False)\n",
    "capture_raw_filt.to_excel(\"final/capture.20200430.xlsx\", index=False)\n",
    "library_raw_filt.to_excel(\"final/library.20200430.xlsx\", index=False)\n",
    "extract_raw_filt.to_excel(\"final/extract.20200430.xlsx\", index=False)\n",
    "clinical_raw_filt.to_excel(\"final/clinical.20200430.xlsx\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python36764bit590e0209625c41dab6b137cee9f052d1",
   "display_name": "Python 3.6.7 64-bit"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}